Read Data

# Load the cleaned US trending-videos dataset.
# NOTE(review): setwd() in a script is fragile (machine/user specific); kept
# for compatibility with the original run, but prefer a full file path.
setwd('/Users/Panda/Desktop')
# header = TRUE spelled out: T is a reassignable alias, not a keyword.
data <- read.csv('USvideos_cleaned.csv', header = TRUE)

Correlation

# Mixed correlation plot (numbers + circles) of the engagement metrics.
library(corrplot)
## corrplot 0.84 loaded
library(ggplot2)
engagement_cols <- c("category_id", "views", "likes", "dislikes", "comment_count")
corrplot.mixed(corr = cor(data[, engagement_cols]))

We have high correlation between views & likes, likes & comment_count, dislikes & comment_count.

Views Vs. Likes

# Scatter of views vs likes; colour and size both encode likes.
# NOTE(review): dropped guides(fill = "none") — no fill aesthetic is mapped,
# and theme(legend.position = "none") already hides every legend.
ggplot(data, aes(x = views, y = likes, colour = likes, size = likes)) +
  geom_jitter() +
  geom_smooth() +
  labs(title = "Views Vs Likes") +
  theme(legend.position = "none")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Views Vs. Dislikes

# Scatter of views vs dislikes; colour and size both encode dislikes.
# NOTE(review): dropped guides(fill = "none") — no fill aesthetic is mapped,
# and theme(legend.position = "none") already hides every legend.
ggplot(data, aes(x = views, y = dislikes, colour = dislikes, size = dislikes)) +
  geom_jitter() +
  geom_smooth() +
  labs(title = "Views Vs Dislikes") +
  theme(legend.position = "none")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Likes Vs. Comments

# Scatter of likes vs comment count; colour and size encode comment_count.
# NOTE(review): dropped guides(fill = "none") — no fill aesthetic is mapped,
# and theme(legend.position = "none") already hides every legend.
ggplot(data, aes(x = likes, y = comment_count, colour = comment_count, size = comment_count)) +
  geom_jitter() +
  geom_smooth() +
  labs(title = "Likes Vs Comments") +
  theme(legend.position = "none")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Dislikes Vs. Comments

# Scatter of dislikes vs comment count; colour and size encode comment_count.
# NOTE(review): dropped guides(fill = "none") — no fill aesthetic is mapped,
# and theme(legend.position = "none") already hides every legend.
ggplot(data, aes(x = dislikes, y = comment_count, colour = comment_count, size = comment_count)) +
  geom_jitter() +
  geom_smooth() +
  labs(title = "Dislikes Vs Comments") +
  theme(legend.position = "none")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Top 10 Most Viewed Videos

# Top 10 most-viewed videos, one row per distinct title.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
Most_Viewed_Videos <- data %>%
  select(title, channel_title, views, likes, category_id, comment_count) %>%
  distinct(title, .keep_all = TRUE) %>%
  mutate(Title = title) %>%   # preserve the raw title before it is relabelled
  arrange(desc(views)) %>%
  head(10)
# NOTE(review): hand-written display labels; they assume the ranking above is
# stable — if the data changes these will mislabel the bars. Verify.
Most_Viewed_Videos$title <- c("El Prestamo by Maluma","Fake Love official by BTS","What is Love by TWICE","Avengers:Infinity War Trailer","Perfect by Ed Sheeran","This is America by Childish Gambino","VENOM Official Trailer","The Shape of 2017 - Youtube Rewind","Nice for What by Drake","Sanju Official Trailer")
# geom_col() is the idiomatic form of geom_bar(stat = "identity").
ggplot(Most_Viewed_Videos, aes(x = reorder(title, -views), y = views)) +
  geom_col(aes(fill = reorder(title, -views))) +
  theme(text = element_text(size = 10), axis.text.x = element_text(angle = 90, hjust = 1))

Top 10 Most Liked Video

# Top 10 most-liked videos, one row per distinct title.
Most_Liked_Videos <- data %>%
  select(title, channel_title, likes) %>%
  distinct(title, .keep_all = TRUE) %>%
  mutate(Title = title) %>%   # preserve the raw title before it is relabelled
  arrange(desc(likes)) %>%
  head(10)
# NOTE(review): hand-written labels assume the ranking is stable — verify.
Most_Liked_Videos$title <- c("Fake Love official by BTS","MIC Drop by BTS","Daydream by j-hope","Love Yourself by BTS","Avengers:Infinity War Trailer","Perfect by Ed Sheeran","Euphoria by BTS","Fake Love Extended by BTS","This is America by Childish Gambino","Airplane by j-hope")
# geom_col() is the idiomatic form of geom_bar(stat = "identity").
ggplot(Most_Liked_Videos, aes(x = reorder(title, -likes), y = likes)) +
  geom_col(aes(fill = reorder(title, -likes))) +
  theme(text = element_text(size = 10), axis.text.x = element_text(angle = 90, hjust = 1))

Top 10 Most Disliked Videos

# Top 10 most-disliked videos, one row per distinct title.
Most_Disliked_Videos <- data %>%
  select(title, channel_title, dislikes) %>%
  distinct(title, .keep_all = TRUE) %>%
  mutate(Title = title) %>%   # preserve the raw title before it is relabelled
  arrange(desc(dislikes)) %>%
  head(10)
# NOTE(review): hand-written labels assume the ranking is stable — verify.
Most_Disliked_Videos$title <- c("So Sorry by Logan Paul","The Shape of 2017 by Youtube Spotlight","Logan Paul is Back by Logan Paul","PSA from Chairman of the FCC Ajit Pai by Daily Caller","Black Ops 4 Multiplayer Reveal Trailer by Call of Duty","Suicide: Be Here Tomorrow. by Logan Paul","Fergie Performs The U.S. National Anthem by MLG Highlights","The FCC repeals its net neutrality rules by Washington Post","What is Love MV by Twice","Santa Diss Track by Logan Paul")
# geom_col() is the idiomatic form of geom_bar(stat = "identity").
ggplot(Most_Disliked_Videos, aes(x = reorder(title, -dislikes), y = dislikes)) +
  geom_col(aes(fill = reorder(title, -dislikes))) +
  theme(text = element_text(size = 10), axis.text.x = element_text(angle = 90, hjust = 1))

Top 10 Most Commented Videos

# Top 10 most-commented videos, one row per distinct title.
Most_Commented_Videos <- data %>%
  select(title, channel_title, comment_count) %>%
  distinct(title, .keep_all = TRUE) %>%
  mutate(Title = title) %>%   # preserve the raw title before it is relabelled
  arrange(desc(comment_count)) %>%
  head(10)
# NOTE(review): hand-written labels assume the ranking is stable — verify.
Most_Commented_Videos$title <- c("So Sorry by Logan Paul","Fake Love official by BTS","The Shape of 2017 by Youtube Spotlight", "Logan Paul is Back by Logan Paul","MIC Drop by BTS","Daydream by j-hope","Suicide: Be Here Tomorrow. by Logan Paul","Melting Every Lipstick by Safiya Nygaard", "Love Yourself by BTS","Avengers:Infinity War Trailer")
# BUG FIX: the x ordering used reorder(title, comment_count) (ascending) while
# the fill — and every sibling plot above — orders by the negated metric;
# use -comment_count so bars are sorted largest-first, consistently.
ggplot(Most_Commented_Videos, aes(x = reorder(title, -comment_count), y = comment_count)) +
  geom_col(aes(fill = reorder(title, -comment_count))) +
  theme(text = element_text(size = 10), axis.text.x = element_text(angle = 90, hjust = 1))

# Drop the first column and coerce columns 7-10 to numeric, going through
# character first so factor level codes are not silently used as values.
# NOTE(review): positions 7-10 are assumed to be the count columns — confirm
# against the CSV header.
videos <- data[, -1]
numeric_cols <- c(7, 8, 9, 10)
videos[numeric_cols] <- lapply(videos[numeric_cols], function(col) as.numeric(as.character(col)))

Top 10 Categories: average views, likes, dislikes and number of comments per category

Cluster Analysis

library(cluster)
# Cluster only on the four engagement metrics.
data_cluster <- videos[, c("views", "dislikes", 'likes', 'comment_count')]
# vapply() pins the return type (sapply() silently changes shape on odd
# input); class() can return a vector, so take its first element.
vapply(data_cluster, function(col) class(col)[1], character(1))
##         views      dislikes         likes comment_count 
##     "numeric"     "numeric"     "numeric"     "numeric"
library(mice)
## Loading required package: lattice
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
set.seed(617)
# Impute missing values with mice's defaults and take one completed dataset.
data_cluster <- mice::complete(mice(data_cluster))
## 
##  iter imp variable
##   1   1
##   1   2
##   1   3
##   1   4
##   1   5
##   2   1
##   2   2
##   2   3
##   2   4
##   2   5
##   3   1
##   3   2
##   3   3
##   3   4
##   3   5
##   4   1
##   4   2
##   4   3
##   4   4
##   4   5
##   5   1
##   5   2
##   5   3
##   5   4
##   5   5

Hierarchical clustering is not suitable because the dataset is large, so we will use k-means instead.

Determining the number of clusters K

total within cluster sum of squares

# Total within-cluster sum of squares for k = 1..10 (elbow plot).
# vapply() with FUN.VALUE = numeric(1) guarantees a numeric vector, unlike
# sapply(). Seed is reset per k so each fit is reproducible.
within_ss <- vapply(1:10, FUN = function(k) {
  set.seed(617)
  kmeans(x = data_cluster, centers = k, iter.max = 1000, nstart = 25)$tot.withinss
}, FUN.VALUE = numeric(1))
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2047450)
ggplot(data = data.frame(cluster = 1:10, within_ss), aes(x = cluster, y = within_ss)) +
  geom_line(col = 'steelblue', size = 1.2) +
  geom_point() +
  scale_x_continuous(breaks = seq(1, 10, 1))

Ratio plot: the ratio of between-cluster sum of squares to total sum of squares for a range of values of k

# Between-cluster SS / total SS for k = 1..10 (ratio plot).
# vapply() with FUN.VALUE = numeric(1) guarantees a numeric vector, unlike
# sapply(). Seed is reset per k so each fit is reproducible.
ratio_ss <- vapply(1:10, FUN = function(k) {
  set.seed(617)
  km <- kmeans(x = data_cluster, centers = k, iter.max = 1000, nstart = 25)
  km$betweenss / km$totss
}, FUN.VALUE = numeric(1))
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2047450)
ggplot(data = data.frame(cluster = 1:10, ratio_ss), aes(x = cluster, y = ratio_ss)) +
  geom_line(col = 'steelblue', size = 1.2) +
  geom_point() +
  scale_x_continuous(breaks = seq(1, 10, 1))

We decide to choose 3 clusters

# Final k-means fit with k = 3 (chosen from the elbow and ratio plots above);
# k_segments holds the per-video cluster assignment used downstream.
set.seed(617)
km <- kmeans(x = data_cluster, centers = 3, iter.max = 10000, nstart = 25)
k_segments <- km$cluster
table(k_segments)
## k_segments
##     1     2     3 
##  1358 39498    93
library(stringr)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Visualise the k-means segments in 2-D factor space.
# FIX: the original called fa() twice with identical arguments (doubling an
# expensive computation and its warnings); run it once and reuse the scores.
# NOTE(review): fa() reported an ultra-Heywood case — the 2-factor solution
# itself deserves scrutiny before these scores are trusted.
fa_scores <- fa(data_cluster, nfactors = 2, rotate = 'varimax')$scores
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
temp <- data.frame(cluster = factor(k_segments),
                   factor1 = fa_scores[, 1],
                   factor2 = fa_scores[, 2])
ggplot(temp, aes(x = factor1, y = factor2, col = cluster)) + geom_point()

# Attach the cluster assignment and profile each segment by its mean
# engagement metrics (rounded to 2 decimals).
data2 <- cbind(videos, k_segments)
library(dplyr)
data2 %>%
  select(views:comment_count, k_segments) %>%
  group_by(k_segments) %>%
  summarize_all(function(x) round(mean(x, na.rm = TRUE), 2)) %>%
  data.frame()
##   k_segments     views      likes  dislikes comment_count
## 1          1  23960331  651853.38  36387.74      68908.13
## 2          2   1368381   47813.78   2057.70       5490.26
## 3          3 108444347 2875072.48 228910.44     381255.40

Time Manipulation

library(lubridate)
## Warning: package 'lubridate' was built under R version 3.6.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# trending_date is stored as year-day-month, publish_date as month-day-year.
data$trending_date <- ydm(data$trending_date)
data$publish_date <- mdy(data$publish_date)
# Days elapsed between publication and appearing on the trending list.
data$dif_days <- as.numeric(difftime(data$trending_date, data$publish_date, units = "days"))

Tags wordcloud

# Word cloud of video tags built with the tm text-mining pipeline.
# NOTE: the order of the tm_map() steps below is deliberate — lower-casing and
# punctuation collapsing must happen before stopword removal.
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
# One document per video's tag string.
corpus = Corpus(VectorSource(data$tags))
# Lower-case everything so stopword matching works.
corpus = tm_map(corpus,FUN = content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, FUN = content_transformer(tolower)):
## transformation drops documents
# Collapse runs of punctuation/whitespace (the "|" separators in tags) into
# single spaces.
corpus = tm_map(corpus,FUN = content_transformer(FUN = function(x)gsub(pattern = '[[:punct:] ]+',replacement = ' ',x = x)))
## Warning in tm_map.SimpleCorpus(corpus, FUN = content_transformer(FUN =
## function(x) gsub(pattern = "[[:punct:] ]+", : transformation drops documents
# Strip any punctuation the regex above left behind (largely redundant, but
# harmless — kept to preserve the original pipeline).
corpus = tm_map(corpus,FUN = removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, FUN = removePunctuation): transformation
## drops documents
# Remove English stopwords.
corpus = tm_map(corpus,FUN = removeWords,c(stopwords('english')))
## Warning in tm_map.SimpleCorpus(corpus, FUN = removeWords,
## c(stopwords("english"))): transformation drops documents
# Squeeze repeated whitespace left by the removals.
corpus = tm_map(corpus,FUN = stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, FUN = stripWhitespace): transformation
## drops documents
# Document-term matrix; drop terms absent from >95% of documents to keep the
# matrix (and the cloud) manageable.
dtm = DocumentTermMatrix(corpus)
xdtm = removeSparseTerms(dtm,sparse = 0.95)
xdtm = as.data.frame(as.matrix(xdtm))
# Corpus-wide frequency per surviving term.
tags=data.frame(word=colnames(xdtm), freq=colSums(xdtm))
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(tags$word,tags$freq, min.freq=10,colors=brewer.pal(6,"Dark2"),random.order = F)

Datatype conversion

Converting ‘title’ and ‘description’ from factor to character type so that text-mining techniques can be applied

# nchar()/unnest_tokens() need character vectors, not factors.
data$description <- as.character(data$description)
data$title <- as.character(data$title)

Mean and Median for title and description

Mean of characters in description

# Average description length, in characters.
mean_char <- mean(nchar(data$description))
mean_char
## [1] 1031.495

Median of characters in description

# Median description length, in characters.
median_char <- median(nchar(data$description))
median_char
## [1] 827

Mean of characters in title

# Average title length, in characters.
mean_char <- mean(nchar(data$title))
mean_char
## [1] 48.57818

Median of characters in title

# Median title length, in characters.
median_char <- median(nchar(data$title))
median_char
## [1] 46

Exploratory on title

Top 10 words in Title

library(tidytext)
# Ten most frequent non-stopword words in video titles.
title_word_counts <- data %>%
  unnest_tokens(input = title, output = word) %>%
  anti_join(stop_words) %>%
  group_by(word) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  top_n(10)
## Joining, by = "word"
## Selecting by count
ggplot(title_word_counts, aes(x = reorder(word, count), y = count, fill = count)) +
  geom_col() +
  xlab('words') +
  coord_flip()

Word cloud for title

library(wordcloud)
# Word frequencies across all titles (stopwords removed) for the cloud.
# NOTE(review): removed the original group_by(category_id) — it was
# immediately replaced by group_by(word) and had no effect on the result.
wordcloudTitle <-
  data %>%
  unnest_tokens(output = word, input = title) %>%
  anti_join(stop_words) %>%
  group_by(word) %>%
  summarize(freq = n()) %>%
  arrange(desc(freq)) %>%
  ungroup() %>%
  data.frame()
## Joining, by = "word"
set.seed(617)
wordcloud(words = wordcloudTitle$word, wordcloudTitle$freq, scale = c(2.5, 1.0), max.words = 100, colors = brewer.pal(9, "Spectral"))

Binary sentiment with ‘bing’

library(ggthemes)
# Overall positive vs negative word counts in titles (bing lexicon).
# FIXES: guides(fill = F) -> guides(fill = "none") (F is reassignable and
# logical guide values are deprecated in current ggplot2); removed the no-op
# group_by(category_id) that group_by(sentiment) immediately replaced.
data %>%
  unnest_tokens(output = word, input = title) %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(sentiment) %>%
  count() %>%
  ggplot(aes(x = sentiment, y = n, fill = sentiment)) + geom_col() + theme_economist() + guides(fill = "none") +
  coord_flip()
## Joining, by = "word"

comments_disabled and title

# Sentiment of title words split by whether comments are disabled.
# BUG FIX: comments_disabled is a factor with levels "FALSE"/"TRUE"; the
# original assigned the strings "0"/"1" into it, which R rejects as invalid
# factor levels and replaces with NA (see the warnings in the original run),
# so the whole column became NA. Convert properly: FALSE -> 0, TRUE -> 1.
data$comments_disabled <- as.integer(data$comments_disabled == 'TRUE')
# NOTE(review): also removed the no-op group_by(category_id)/ungroup() pair —
# the grouping had no effect on unnest_tokens() or the counts that follow.
data %>%
  select(category_id, title, comments_disabled) %>%
  unnest_tokens(output = word, input = title) %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(comments_disabled, sentiment) %>%
  summarize(n = n()) %>%
  mutate(proportion = n/sum(n)) %>%
  ggplot(aes(x = comments_disabled, y = proportion, fill = sentiment)) + geom_col() + theme_economist() + coord_flip()
## Joining, by = "word"

ratings_disabled and title

# Sentiment of title words split by whether ratings are disabled.
# BUG FIX: same factor bug as comments_disabled — assigning "0"/"1" into a
# factor with levels "FALSE"/"TRUE" generated NAs (per the original run's
# warnings). Convert properly: FALSE -> 0, TRUE -> 1.
data$ratings_disabled <- as.integer(data$ratings_disabled == 'TRUE')
# NOTE(review): removed the no-op group_by(category_id)/ungroup() pair.
data %>%
  select(category_id, title, ratings_disabled) %>%
  unnest_tokens(output = word, input = title) %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(ratings_disabled, sentiment) %>%
  summarize(n = n()) %>%
  mutate(proportion = n/sum(n)) %>%
  ggplot(aes(x = ratings_disabled, y = proportion, fill = sentiment)) + geom_col() + theme_economist() + coord_flip()
## Joining, by = "word"

video_error_or_removed and title

# Sentiment of title words split by whether the video errored or was removed.
# BUG FIX: same factor bug as above — assigning "0"/"1" into a factor with
# levels "FALSE"/"TRUE" generated NAs (per the original run's warnings).
# Convert properly: FALSE -> 0, TRUE -> 1.
data$video_error_or_removed <- as.integer(data$video_error_or_removed == 'TRUE')
# NOTE(review): removed the no-op group_by(category_id)/ungroup() pair.
data %>%
  select(category_id, title, video_error_or_removed) %>%
  unnest_tokens(output = word, input = title) %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(video_error_or_removed, sentiment) %>%
  summarize(n = n()) %>%
  mutate(proportion = n/sum(n)) %>%
  ggplot(aes(x = video_error_or_removed, y = proportion, fill = sentiment)) + geom_col() + theme_economist() + coord_flip()
## Joining, by = "word"

Exploratory Analysis on description

Top 10 words

# Ten most frequent non-stopword words in video descriptions.
description_word_counts <- data %>%
  unnest_tokens(input = description, output = word) %>%
  anti_join(stop_words) %>%
  group_by(word) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  arrange(desc(count)) %>%
  top_n(10)
## Joining, by = "word"
## Selecting by count
ggplot(description_word_counts, aes(x = reorder(word, count), y = count, fill = count)) +
  geom_col() +
  xlab('words') +
  coord_flip()

Word cloud

# Word frequencies across all descriptions (stopwords removed) for the cloud.
# FIXES: renamed the variable — the original reused "wordcloudTitle" for
# description data — and removed the no-op group_by(category_id) that
# group_by(word) immediately replaced.
wordcloudDescription <-
  data %>%
  unnest_tokens(output = word, input = description) %>%
  anti_join(stop_words) %>%
  group_by(word) %>%
  summarize(freq = n()) %>%
  arrange(desc(freq)) %>%
  ungroup() %>%
  data.frame()
## Joining, by = "word"
set.seed(617)
wordcloud(words = wordcloudDescription$word, wordcloudDescription$freq, scale = c(2.5, 1.0), max.words = 100, colors = brewer.pal(9, "Spectral"))

comments_disabled and description

# Sentiment of description words split by comments_disabled.
# NOTE(review): removed the no-op group_by(category_id)/ungroup() pair — the
# grouping had no effect on unnest_tokens() or the counts that follow.
data %>%
  select(category_id, description, comments_disabled) %>%
  unnest_tokens(output = word, input = description) %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(comments_disabled, sentiment) %>%
  summarize(n = n()) %>%
  mutate(proportion = n/sum(n)) %>%
  ggplot(aes(x = comments_disabled, y = proportion, fill = sentiment)) + geom_col() + theme_economist() + coord_flip()
## Joining, by = "word"

ratings_disabled and description

# Sentiment of description words split by ratings_disabled.
# NOTE(review): removed the no-op group_by(category_id)/ungroup() pair.
data %>%
  select(category_id, description, ratings_disabled) %>%
  unnest_tokens(output = word, input = description) %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(ratings_disabled, sentiment) %>%
  summarize(n = n()) %>%
  mutate(proportion = n/sum(n)) %>%
  ggplot(aes(x = ratings_disabled, y = proportion, fill = sentiment)) + geom_col() + theme_economist() + coord_flip()
## Joining, by = "word"

video_error_or_removed and description

# Sentiment of description words split by video_error_or_removed.
# NOTE(review): removed the no-op group_by(category_id)/ungroup() pair.
data %>%
  select(category_id, description, video_error_or_removed) %>%
  unnest_tokens(output = word, input = description) %>%
  inner_join(get_sentiments('bing')) %>%
  group_by(video_error_or_removed, sentiment) %>%
  summarize(n = n()) %>%
  mutate(proportion = n/sum(n)) %>%
  ggplot(aes(x = video_error_or_removed, y = proportion, fill = sentiment)) + geom_col() + theme_economist() + coord_flip()
## Joining, by = "word"

Emotions with Lexicon NRC

Emotions in Title

# NRC emotion counts across title words.
# FIXES: guides(fill = F) -> guides(fill = "none") (F is reassignable and
# logical guide values are deprecated in current ggplot2); removed the no-op
# group_by(category_id) that group_by(sentiment) immediately replaced.
data %>%
  unnest_tokens(output = word, input = title) %>%
  inner_join(get_sentiments('nrc')) %>%
  group_by(sentiment) %>%
  count() %>%
  ggplot(aes(x = reorder(sentiment, X = n), y = n, fill = sentiment)) + geom_col() + guides(fill = "none") + coord_flip() + theme_wsj()
## Joining, by = "word"

Emotions in description

# NRC emotion counts across description words.
# FIXES: guides(fill = F) -> guides(fill = "none"); removed the no-op
# group_by(category_id) that group_by(sentiment) immediately replaced.
data %>%
  unnest_tokens(output = word, input = description) %>%
  inner_join(get_sentiments('nrc')) %>%
  group_by(sentiment) %>%
  count() %>%
  ggplot(aes(x = reorder(sentiment, X = n), y = n, fill = sentiment)) + geom_col() + guides(fill = "none") + coord_flip() + theme_wsj()
## Joining, by = "word"

Binary sentiment cloud

library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:mice':
## 
##     complete
# Per-word positive/negative frequencies from titles, shaped for
# comparison.cloud (rownames = word, one column per sentiment).
# NOTE(review): removed the no-op group_by(category_id)/ungroup() pair; spelt
# out sort = TRUE (T is a reassignable alias).
wordcloudData <-
  data %>%
  unnest_tokens(output = word, input = title) %>%
  anti_join(stop_words) %>%
  inner_join(get_sentiments('bing')) %>%
  count(sentiment, word, sort = TRUE) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  data.frame()
## Joining, by = "word"
## Joining, by = "word"
rownames(wordcloudData) <- wordcloudData[, 'word']
wordcloudData <- wordcloudData[, c('positive', 'negative')]
set.seed(617)
comparison.cloud(term.matrix = wordcloudData, scale = c(2.0, 0.5), max.words = 200, rot.per = 0)

Publish Hour Heatmap

library(lubridate)
# Build a POSIXct publish timestamp from the separate date and hour fields.
# BUG FIX: the original used as.POSIXlt(); POSIXlt is a list-of-components
# class that misbehaves as a data-frame column — use as.POSIXct instead
# (hour() works on both).
videos$publish_date <- as.character(videos$publish_date)
videos$publish_hour <- as.character(videos$publish_hour)
videos$publish_datetime <- paste(videos$publish_date, videos$publish_hour)
videos$publish_datetime <- as.POSIXct(videos$publish_datetime, format = "%m/%d/%Y %H:%M", tz = Sys.timezone())
videos$hour <- hour(videos$publish_datetime)
videos$publish_date <- as.Date(videos$publish_date, "%m/%d/%Y")
videos$day <- weekdays(videos$publish_date)   # already Date; the original's extra as.Date() was redundant

library(ggplot2)
# BUG FIX: the original mapped fill = category_id while labelling the legend
# "Total Uploads" — the colour encoded a category code, not a count. Fill now
# encodes the actual number of uploads per (day, hour) cell.
upload_counts <- videos %>% count(day, hour)
ggplot(upload_counts, aes(x = hour, y = day)) +
  geom_tile(aes(fill = n)) +
  scale_fill_gradient(name = 'Total Uploads', low = 'white', high = 'navy') +
  theme(axis.title.y = element_blank())